import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
hospital = pd.read_csv("Hospitalisation details.csv")
medical = pd.read_csv("Medical Examinations.csv")
customer = pd.read_excel("Names.xlsx")
hospital.head()
| Customer ID | year | month | date | children | charges | Hospital tier | City tier | State ID | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | Jul | 9 | 0 | 563.84 | tier - 2 | tier - 3 | R1013 |
| 1 | Id2334 | 1992 | Nov | 30 | 0 | 570.62 | tier - 2 | tier - 1 | R1013 |
| 2 | Id2333 | 1993 | Jun | 30 | 0 | 600.00 | tier - 2 | tier - 1 | R1013 |
| 3 | Id2332 | 1992 | Sep | 13 | 0 | 604.54 | tier - 3 | tier - 3 | R1013 |
| 4 | Id2331 | 1998 | Jul | 27 | 0 | 637.26 | tier - 3 | tier - 3 | R1013 |
hospital.shape
(2343, 9)
hospital.describe()
| date | children | charges | |
|---|---|---|---|
| count | 2343.000000 | 2343.000000 | 2343.000000 |
| mean | 15.554844 | 1.026035 | 13559.067870 |
| std | 8.721194 | 1.233847 | 11922.658415 |
| min | 1.000000 | 0.000000 | 563.840000 |
| 25% | 8.000000 | 0.000000 | 5084.010000 |
| 50% | 15.000000 | 0.000000 | 9634.540000 |
| 75% | 23.000000 | 2.000000 | 17029.675000 |
| max | 30.000000 | 5.000000 | 63770.430000 |
hospital.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2343 entries, 0 to 2342 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer ID 2343 non-null object 1 year 2343 non-null object 2 month 2343 non-null object 3 date 2343 non-null int64 4 children 2343 non-null int64 5 charges 2343 non-null float64 6 Hospital tier 2343 non-null object 7 City tier 2343 non-null object 8 State ID 2343 non-null object dtypes: float64(1), int64(2), object(6) memory usage: 164.9+ KB
medical.head()
| Customer ID | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | |
|---|---|---|---|---|---|---|---|---|
| 0 | Id1 | 47.410 | 7.47 | No | No | No | No major surgery | yes |
| 1 | Id2 | 30.360 | 5.77 | No | No | No | No major surgery | yes |
| 2 | Id3 | 34.485 | 11.87 | yes | No | No | 2 | yes |
| 3 | Id4 | 38.095 | 6.05 | No | No | No | No major surgery | yes |
| 4 | Id5 | 35.530 | 5.45 | No | No | No | No major surgery | yes |
medical.shape
(2335, 8)
medical.describe()
| BMI | HBA1C | |
|---|---|---|
| count | 2335.000000 | 2335.000000 |
| mean | 30.972649 | 6.578998 |
| std | 8.742095 | 2.228731 |
| min | 15.010000 | 4.000000 |
| 25% | 24.600000 | 4.900000 |
| 50% | 30.400000 | 5.810000 |
| 75% | 36.300000 | 7.955000 |
| max | 55.050000 | 12.000000 |
medical.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2335 entries, 0 to 2334 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer ID 2335 non-null object 1 BMI 2335 non-null float64 2 HBA1C 2335 non-null float64 3 Heart Issues 2335 non-null object 4 Any Transplants 2335 non-null object 5 Cancer history 2335 non-null object 6 NumberOfMajorSurgeries 2335 non-null object 7 smoker 2335 non-null object dtypes: float64(2), object(6) memory usage: 146.1+ KB
customer.head()
| Customer ID | name | |
|---|---|---|
| 0 | Id1 | Hawks, Ms. Kelly |
| 1 | Id2 | Lehner, Mr. Matthew D |
| 2 | Id3 | Lu, Mr. Phil |
| 3 | Id4 | Osborne, Ms. Kelsey |
| 4 | Id5 | Kadala, Ms. Kristyn |
customer.shape
(2335, 2)
customer.describe()
| Customer ID | name | |
|---|---|---|
| count | 2335 | 2335 |
| unique | 2335 | 2335 |
| top | Id1 | Hawks, Ms. Kelly |
| freq | 1 | 1 |
customer.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2335 entries, 0 to 2334 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer ID 2335 non-null object 1 name 2335 non-null object dtypes: object(2) memory usage: 36.6+ KB
df = pd.merge(pd.merge(hospital,medical,on='Customer ID'),customer,on='Customer ID')
df
| Customer ID | year | month | date | children | charges | Hospital tier | City tier | State ID | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | Jul | 9 | 0 | 563.84 | tier - 2 | tier - 3 | R1013 | 17.580 | 4.51 | No | No | No | 1 | No | German, Mr. Aaron K |
| 1 | Id2334 | 1992 | Nov | 30 | 0 | 570.62 | tier - 2 | tier - 1 | R1013 | 17.600 | 4.39 | No | No | No | 1 | No | Rosendahl, Mr. Evan P |
| 2 | Id2333 | 1993 | Jun | 30 | 0 | 600.00 | tier - 2 | tier - 1 | R1013 | 16.470 | 6.35 | No | No | Yes | 1 | No | Albano, Ms. Julie |
| 3 | Id2332 | 1992 | Sep | 13 | 0 | 604.54 | tier - 3 | tier - 3 | R1013 | 17.700 | 6.28 | No | No | No | 1 | No | Riveros Gonzalez, Mr. Juan D. Sr. |
| 4 | Id2331 | 1998 | Jul | 27 | 0 | 637.26 | tier - 3 | tier - 3 | R1013 | 22.340 | 5.57 | No | No | No | 1 | No | Brietzke, Mr. Jordan |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2330 | Id5 | 1989 | Jun | 19 | 0 | 55135.40 | tier - 1 | tier - 2 | R1012 | 35.530 | 5.45 | No | No | No | No major surgery | yes | Kadala, Ms. Kristyn |
| 2331 | Id4 | 1991 | Jun | 6 | 1 | 58571.07 | tier - 1 | tier - 3 | R1024 | 38.095 | 6.05 | No | No | No | No major surgery | yes | Osborne, Ms. Kelsey |
| 2332 | Id3 | 1970 | ? | 11 | 3 | 60021.40 | tier - 1 | tier - 1 | R1012 | 34.485 | 11.87 | yes | No | No | 2 | yes | Lu, Mr. Phil |
| 2333 | Id2 | 1977 | Jun | 8 | 0 | 62592.87 | tier - 2 | tier - 3 | R1013 | 30.360 | 5.77 | No | No | No | No major surgery | yes | Lehner, Mr. Matthew D |
| 2334 | Id1 | 1968 | Oct | 12 | 0 | 63770.43 | tier - 1 | tier - 3 | R1013 | 47.410 | 7.47 | No | No | No | No major surgery | yes | Hawks, Ms. Kelly |
2335 rows × 17 columns
df.shape
(2335, 17)
df.isna().sum().sum()
0
trivial = df[df.eq("?").any(axis=1)]
trivial.shape
(10, 17)
round(trivial.shape[0]/df.shape[0]*100, 2)
0.43
df.drop(df[df.eq("?").any(axis=1)].index, axis=0, inplace=True)
df.shape
(2325, 17)
from sklearn.preprocessing import LabelEncoder
l = LabelEncoder()
df['smoker'] = l.fit_transform(df['smoker'])
df['Any Transplants'] = l.fit_transform(df['Any Transplants'])
df['Heart Issues'] = l.fit_transform(df['Heart Issues'])
df['Cancer history'] = l.fit_transform(df['Cancer history'])
df['Cancer history'].value_counts()
Cancer history 0 1934 1 391 Name: count, dtype: int64
df['Heart Issues'].value_counts()
Heart Issues 0 1405 1 920 Name: count, dtype: int64
df['Any Transplants'].value_counts()
Any Transplants 0 2183 1 142 Name: count, dtype: int64
df['smoker'].value_counts()
smoker 0 1839 1 486 Name: count, dtype: int64
def fun(val):
return int(val.replace("tier", "").replace(" ", "").replace("-", ""))
df['Hospital tier'] = df['Hospital tier'].map(fun)
df['City tier'] = df['City tier'].map(fun)
df
| Customer ID | year | month | date | children | charges | Hospital tier | City tier | State ID | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | Jul | 9 | 0 | 563.84 | 2 | 3 | R1013 | 17.580 | 4.51 | 0 | 0 | 0 | 1 | 0 | German, Mr. Aaron K |
| 1 | Id2334 | 1992 | Nov | 30 | 0 | 570.62 | 2 | 1 | R1013 | 17.600 | 4.39 | 0 | 0 | 0 | 1 | 0 | Rosendahl, Mr. Evan P |
| 2 | Id2333 | 1993 | Jun | 30 | 0 | 600.00 | 2 | 1 | R1013 | 16.470 | 6.35 | 0 | 0 | 1 | 1 | 0 | Albano, Ms. Julie |
| 3 | Id2332 | 1992 | Sep | 13 | 0 | 604.54 | 3 | 3 | R1013 | 17.700 | 6.28 | 0 | 0 | 0 | 1 | 0 | Riveros Gonzalez, Mr. Juan D. Sr. |
| 4 | Id2331 | 1998 | Jul | 27 | 0 | 637.26 | 3 | 3 | R1013 | 22.340 | 5.57 | 0 | 0 | 0 | 1 | 0 | Brietzke, Mr. Jordan |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2329 | Id6 | 1962 | Aug | 4 | 0 | 52590.83 | 1 | 3 | R1011 | 32.800 | 6.59 | 0 | 0 | 0 | No major surgery | 1 | Baker, Mr. Russell B. |
| 2330 | Id5 | 1989 | Jun | 19 | 0 | 55135.40 | 1 | 2 | R1012 | 35.530 | 5.45 | 0 | 0 | 0 | No major surgery | 1 | Kadala, Ms. Kristyn |
| 2331 | Id4 | 1991 | Jun | 6 | 1 | 58571.07 | 1 | 3 | R1024 | 38.095 | 6.05 | 0 | 0 | 0 | No major surgery | 1 | Osborne, Ms. Kelsey |
| 2333 | Id2 | 1977 | Jun | 8 | 0 | 62592.87 | 2 | 3 | R1013 | 30.360 | 5.77 | 0 | 0 | 0 | No major surgery | 1 | Lehner, Mr. Matthew D |
| 2334 | Id1 | 1968 | Oct | 12 | 0 | 63770.43 | 1 | 3 | R1013 | 47.410 | 7.47 | 0 | 0 | 0 | No major surgery | 1 | Hawks, Ms. Kelly |
2325 rows × 17 columns
df['State ID'].value_counts()
State ID R1013 609 R1011 574 R1012 572 R1024 159 R1026 84 R1021 70 R1016 64 R1025 40 R1023 38 R1017 36 R1019 26 R1022 14 R1014 13 R1015 11 R1018 9 R1020 6 Name: count, dtype: int64
Dummies = pd.get_dummies(df["State ID"], prefix= "State_ID")
Dummies
| State_ID_R1011 | State_ID_R1012 | State_ID_R1013 | State_ID_R1014 | State_ID_R1015 | State_ID_R1016 | State_ID_R1017 | State_ID_R1018 | State_ID_R1019 | State_ID_R1020 | State_ID_R1021 | State_ID_R1022 | State_ID_R1023 | State_ID_R1024 | State_ID_R1025 | State_ID_R1026 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 1 | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 3 | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 4 | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2329 | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 2330 | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 2331 | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False |
| 2333 | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 2334 | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False |
2325 rows × 16 columns
Dummy = Dummies[['State_ID_R1011','State_ID_R1012', 'State_ID_R1013']]
Dummy
| State_ID_R1011 | State_ID_R1012 | State_ID_R1013 | |
|---|---|---|---|
| 0 | False | False | True |
| 1 | False | False | True |
| 2 | False | False | True |
| 3 | False | False | True |
| 4 | False | False | True |
| ... | ... | ... | ... |
| 2329 | True | False | False |
| 2330 | False | True | False |
| 2331 | False | False | False |
| 2333 | False | False | True |
| 2334 | False | False | True |
2325 rows × 3 columns
df = pd.concat([df,Dummy],axis=1)
df
| Customer ID | year | month | date | children | charges | Hospital tier | City tier | State ID | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | name | State_ID_R1011 | State_ID_R1012 | State_ID_R1013 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | Jul | 9 | 0 | 563.84 | 2 | 3 | R1013 | 17.580 | 4.51 | 0 | 0 | 0 | 1 | 0 | German, Mr. Aaron K | False | False | True |
| 1 | Id2334 | 1992 | Nov | 30 | 0 | 570.62 | 2 | 1 | R1013 | 17.600 | 4.39 | 0 | 0 | 0 | 1 | 0 | Rosendahl, Mr. Evan P | False | False | True |
| 2 | Id2333 | 1993 | Jun | 30 | 0 | 600.00 | 2 | 1 | R1013 | 16.470 | 6.35 | 0 | 0 | 1 | 1 | 0 | Albano, Ms. Julie | False | False | True |
| 3 | Id2332 | 1992 | Sep | 13 | 0 | 604.54 | 3 | 3 | R1013 | 17.700 | 6.28 | 0 | 0 | 0 | 1 | 0 | Riveros Gonzalez, Mr. Juan D. Sr. | False | False | True |
| 4 | Id2331 | 1998 | Jul | 27 | 0 | 637.26 | 3 | 3 | R1013 | 22.340 | 5.57 | 0 | 0 | 0 | 1 | 0 | Brietzke, Mr. Jordan | False | False | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2329 | Id6 | 1962 | Aug | 4 | 0 | 52590.83 | 1 | 3 | R1011 | 32.800 | 6.59 | 0 | 0 | 0 | No major surgery | 1 | Baker, Mr. Russell B. | True | False | False |
| 2330 | Id5 | 1989 | Jun | 19 | 0 | 55135.40 | 1 | 2 | R1012 | 35.530 | 5.45 | 0 | 0 | 0 | No major surgery | 1 | Kadala, Ms. Kristyn | False | True | False |
| 2331 | Id4 | 1991 | Jun | 6 | 1 | 58571.07 | 1 | 3 | R1024 | 38.095 | 6.05 | 0 | 0 | 0 | No major surgery | 1 | Osborne, Ms. Kelsey | False | False | False |
| 2333 | Id2 | 1977 | Jun | 8 | 0 | 62592.87 | 2 | 3 | R1013 | 30.360 | 5.77 | 0 | 0 | 0 | No major surgery | 1 | Lehner, Mr. Matthew D | False | False | True |
| 2334 | Id1 | 1968 | Oct | 12 | 0 | 63770.43 | 1 | 3 | R1013 | 47.410 | 7.47 | 0 | 0 | 0 | No major surgery | 1 | Hawks, Ms. Kelly | False | False | True |
2325 rows × 20 columns
df['NumberOfMajorSurgeries'] = df['NumberOfMajorSurgeries'].replace('No major surgery',0)
df['NumberOfMajorSurgeries'].value_counts()
NumberOfMajorSurgeries 0 1070 1 961 2 272 3 22 Name: count, dtype: int64
df['year'] = pd.to_datetime(df['year'], format='%Y').dt.year
df['year']
0 1992
1 1992
2 1993
3 1992
4 1998
...
2329 1962
2330 1989
2331 1991
2333 1977
2334 1968
Name: year, Length: 2325, dtype: int32
df['month'] = pd.to_datetime(df['month'], format='%b').dt.month
df['month']
0 7
1 11
2 6
3 9
4 7
..
2329 8
2330 6
2331 6
2333 6
2334 10
Name: month, Length: 2325, dtype: int32
df['DateInt'] = df['year'].astype(str) + df['month'].astype(str).str.zfill(2) + df['date'].astype(str).str.zfill(2)
df['DOB'] = pd.to_datetime(df.DateInt, format='%Y%m%d')
df.drop(['DateInt'], inplace=True, axis=1)
import datetime as dt
current_date = dt.datetime.now()
df['Age'] = (((current_date-df.DOB).dt.days)/365).astype(int)
df
| Customer ID | year | month | date | children | charges | Hospital tier | City tier | State ID | BMI | ... | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | name | State_ID_R1011 | State_ID_R1012 | State_ID_R1013 | DOB | Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Id2335 | 1992 | 7 | 9 | 0 | 563.84 | 2 | 3 | R1013 | 17.580 | ... | 0 | 0 | 1 | 0 | German, Mr. Aaron K | False | False | True | 1992-07-09 | 31 |
| 1 | Id2334 | 1992 | 11 | 30 | 0 | 570.62 | 2 | 1 | R1013 | 17.600 | ... | 0 | 0 | 1 | 0 | Rosendahl, Mr. Evan P | False | False | True | 1992-11-30 | 31 |
| 2 | Id2333 | 1993 | 6 | 30 | 0 | 600.00 | 2 | 1 | R1013 | 16.470 | ... | 0 | 1 | 1 | 0 | Albano, Ms. Julie | False | False | True | 1993-06-30 | 30 |
| 3 | Id2332 | 1992 | 9 | 13 | 0 | 604.54 | 3 | 3 | R1013 | 17.700 | ... | 0 | 0 | 1 | 0 | Riveros Gonzalez, Mr. Juan D. Sr. | False | False | True | 1992-09-13 | 31 |
| 4 | Id2331 | 1998 | 7 | 27 | 0 | 637.26 | 3 | 3 | R1013 | 22.340 | ... | 0 | 0 | 1 | 0 | Brietzke, Mr. Jordan | False | False | True | 1998-07-27 | 25 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2329 | Id6 | 1962 | 8 | 4 | 0 | 52590.83 | 1 | 3 | R1011 | 32.800 | ... | 0 | 0 | 0 | 1 | Baker, Mr. Russell B. | True | False | False | 1962-08-04 | 61 |
| 2330 | Id5 | 1989 | 6 | 19 | 0 | 55135.40 | 1 | 2 | R1012 | 35.530 | ... | 0 | 0 | 0 | 1 | Kadala, Ms. Kristyn | False | True | False | 1989-06-19 | 34 |
| 2331 | Id4 | 1991 | 6 | 6 | 1 | 58571.07 | 1 | 3 | R1024 | 38.095 | ... | 0 | 0 | 0 | 1 | Osborne, Ms. Kelsey | False | False | False | 1991-06-06 | 32 |
| 2333 | Id2 | 1977 | 6 | 8 | 0 | 62592.87 | 2 | 3 | R1013 | 30.360 | ... | 0 | 0 | 0 | 1 | Lehner, Mr. Matthew D | False | False | True | 1977-06-08 | 46 |
| 2334 | Id1 | 1968 | 10 | 12 | 0 | 63770.43 | 1 | 3 | R1013 | 47.410 | ... | 0 | 0 | 0 | 1 | Hawks, Ms. Kelly | False | False | True | 1968-10-12 | 55 |
2325 rows × 22 columns
def gen(x):
if 'Ms.' in x:
return 0
else:
return 1
df['Gender'] = df['name'].map(gen)
df['Gender']
0 1
1 1
2 0
3 1
4 1
..
2329 1
2330 0
2331 0
2333 1
2334 0
Name: Gender, Length: 2325, dtype: int64
# Histogram
sns.histplot(df['charges'])
<Axes: xlabel='charges', ylabel='Count'>
# box and whisker plot
sns.boxplot(df['charges'])
<Axes: ylabel='charges'>
import warnings
warnings.filterwarnings("ignore")
# Swarm Plot
plt.figure(figsize=(30,10))
sns.swarmplot(x='year', y='charges', hue="Gender", data=df)
<Axes: xlabel='year', ylabel='charges'>
sns.countplot(data = df,x = 'Hospital tier', hue = 'Gender')
<Axes: xlabel='Hospital tier', ylabel='count'>
df[df['Hospital tier']==1].charges.median()
32097.434999999998
df[df['Hospital tier']==2].charges.median()
7168.76
df[df['Hospital tier']==3].charges.median()
10676.83
import plotly.express as px
df1 = pd.DataFrame(dict(
r=[32097.434999999998,7168.76,10676.83],
theta=['Tier 1','Tier 2','Tier 3']))
fig = px.line_polar(df1, r='r', theta='theta', line_close=True)
fig.show()
city_freq = df["City tier"].value_counts().rename_axis('City&hospital_tier').reset_index(name='city_counts')
hospital_freq = df["Hospital tier"].value_counts().rename_axis('City&hospital_tier').reset_index(name='hospital_counts')
freq_table = pd.merge(city_freq, hospital_freq, on = 'City&hospital_tier')
freq_table
| City&hospital_tier | city_counts | hospital_counts | |
|---|---|---|---|
| 0 | 2 | 807 | 1334 |
| 1 | 3 | 789 | 691 |
| 2 | 1 | 729 | 300 |
x = freq_table['City&hospital_tier']
y1 = freq_table['city_counts']
y2 = freq_table['hospital_counts']
# plot bars in stack manner
plt.bar(x, y1, color='r')
plt.bar(x, y2, bottom=y1, color='b')
plt.show()
from scipy.stats import ttest_1samp
from scipy.stats import friedmanchisquare
data1 = [32097.43]
data2 = [7168.76]
data3 = [10676.83]
stat, p = friedmanchisquare(data1, data2, data3)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
print('Probably the same distribution')
else:
print('Probably different distributions')
stat=2.000, p=0.368 Probably the same distribution
print("median cost of tier 1 city:", df[df["City tier"]==1].charges.median())
print("median cost of tier 2 city:", df[df["City tier"]==2].charges.median())
print("median cost of tier 3 city:", df[df["City tier"]==3].charges.median())
median cost of tier 1 city: 10027.15 median cost of tier 2 city: 8968.33 median cost of tier 3 city: 9880.07
data1 = [10027.15]
data2 = [8968.33]
data3 = [9880.07]
stat, p = friedmanchisquare(data1, data2, data3)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
print('Probably the same distribution')
else:
print('Probably different distributions')
stat=2.000, p=0.368 Probably the same distribution
print("median cost of smoker:", df[df["smoker"]==1].charges.median())
print("median cost of non smoker:", df[df["smoker"]==0].charges.median())
median cost of smoker: 34125.475 median cost of non smoker: 7537.16
from scipy.stats import kruskal
data1 = [34125.475]
data2 = [7537.16]
stat, p = kruskal(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
print('Probably the same distribution')
else:
print('Probably different distributions')
stat=1.000, p=0.317 Probably the same distribution
from scipy.stats import chi2_contingency
table = [[df["Heart Issues"].value_counts()],[df["smoker"].value_counts()]]
stat, p, dof, expected = chi2_contingency(table)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
print('Probably independent')
else:
print('Probably dependent')
stat=191.145, p=0.000 Probably dependent
df.drop(["Customer ID","State ID",'name', 'year', 'month', 'date', 'DOB'], inplace=True, axis=1)
df.head()
| children | charges | Hospital tier | City tier | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | State_ID_R1011 | State_ID_R1012 | State_ID_R1013 | Age | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 563.84 | 2 | 3 | 17.58 | 4.51 | 0 | 0 | 0 | 1 | 0 | False | False | True | 31 | 1 |
| 1 | 0 | 570.62 | 2 | 1 | 17.60 | 4.39 | 0 | 0 | 0 | 1 | 0 | False | False | True | 31 | 1 |
| 2 | 0 | 600.00 | 2 | 1 | 16.47 | 6.35 | 0 | 0 | 1 | 1 | 0 | False | False | True | 30 | 0 |
| 3 | 0 | 604.54 | 3 | 3 | 17.70 | 6.28 | 0 | 0 | 0 | 1 | 0 | False | False | True | 31 | 1 |
| 4 | 0 | 637.26 | 3 | 3 | 22.34 | 5.57 | 0 | 0 | 0 | 1 | 0 | False | False | True | 25 | 1 |
Dropped those columns because they are not usable to model building
df.shape
(2325, 16)
df.head()
| children | charges | Hospital tier | City tier | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | State_ID_R1011 | State_ID_R1012 | State_ID_R1013 | Age | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 563.84 | 2 | 3 | 17.58 | 4.51 | 0 | 0 | 0 | 1 | 0 | False | False | True | 31 | 1 |
| 1 | 0 | 570.62 | 2 | 1 | 17.60 | 4.39 | 0 | 0 | 0 | 1 | 0 | False | False | True | 31 | 1 |
| 2 | 0 | 600.00 | 2 | 1 | 16.47 | 6.35 | 0 | 0 | 1 | 1 | 0 | False | False | True | 30 | 0 |
| 3 | 0 | 604.54 | 3 | 3 | 17.70 | 6.28 | 0 | 0 | 0 | 1 | 0 | False | False | True | 31 | 1 |
| 4 | 0 | 637.26 | 3 | 3 | 22.34 | 5.57 | 0 | 0 | 0 | 1 | 0 | False | False | True | 25 | 1 |
correlation = df.corr()
correlation
| children | charges | Hospital tier | City tier | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | State_ID_R1011 | State_ID_R1012 | State_ID_R1013 | Age | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| children | 1.000000 | 0.055901 | -0.052438 | -0.015760 | -0.005339 | -0.101379 | 0.023984 | -0.142040 | -0.027880 | -0.113161 | 0.017713 | 0.011666 | 0.005247 | -0.013834 | -0.005457 | 0.011205 |
| charges | 0.055901 | 1.000000 | -0.446687 | 0.035300 | 0.346730 | 0.139697 | 0.049299 | -0.127028 | -0.022522 | 0.053308 | 0.838462 | 0.286956 | -0.074636 | -0.150634 | 0.304395 | 0.034069 |
| Hospital tier | -0.052438 | -0.446687 | 1.000000 | -0.039755 | -0.104771 | 0.057855 | 0.053376 | 0.011729 | -0.021429 | 0.033230 | -0.474077 | -0.114685 | 0.020272 | 0.002455 | 0.133771 | 0.041261 |
| City tier | -0.015760 | 0.035300 | -0.039755 | 1.000000 | 0.038123 | -0.005404 | 0.023152 | 0.002970 | -0.018639 | 0.027937 | 0.032034 | 0.036049 | -0.018253 | 0.002766 | -0.008070 | 0.054073 |
| BMI | -0.005339 | 0.346730 | -0.104771 | 0.038123 | 1.000000 | -0.006920 | 0.017129 | 0.015893 | -0.020235 | 0.018851 | 0.107126 | 0.115671 | 0.017939 | -0.208744 | 0.049260 | 0.079930 |
| HBA1C | -0.101379 | 0.139697 | 0.057855 | -0.005404 | -0.006920 | 1.000000 | 0.007699 | -0.159855 | -0.170921 | -0.091594 | 0.007257 | 0.015525 | -0.019513 | 0.033453 | 0.460558 | -0.027339 |
| Heart Issues | 0.023984 | 0.049299 | 0.053376 | 0.023152 | 0.017129 | 0.007699 | 1.000000 | -0.140269 | 0.111190 | 0.206147 | -0.007159 | 0.005852 | 0.021770 | -0.027967 | 0.192273 | 0.010277 |
| Any Transplants | -0.142040 | -0.127028 | 0.011729 | 0.002970 | 0.015893 | -0.159855 | -0.140269 | 1.000000 | -0.114677 | 0.158593 | -0.025101 | -0.058553 | -0.066453 | 0.064563 | -0.381084 | -0.012737 |
| Cancer history | -0.027880 | -0.022522 | -0.021429 | -0.018639 | -0.020235 | -0.170921 | 0.111190 | -0.114677 | 1.000000 | 0.204208 | 0.006415 | 0.011919 | 0.058222 | -0.066475 | -0.101073 | 0.009359 |
| NumberOfMajorSurgeries | -0.113161 | 0.053308 | 0.033230 | 0.027937 | 0.018851 | -0.091594 | 0.206147 | 0.158593 | 0.204208 | 1.000000 | 0.017199 | 0.000208 | -0.002098 | -0.002056 | 0.151442 | -0.003349 |
| smoker | 0.017713 | 0.838462 | -0.474077 | 0.032034 | 0.107126 | 0.007257 | -0.007159 | -0.025101 | 0.006415 | 0.017199 | 1.000000 | 0.336112 | -0.106998 | -0.094547 | 0.011939 | 0.020968 |
| State_ID_R1011 | 0.011666 | 0.286956 | -0.114685 | 0.036049 | 0.115671 | 0.015525 | 0.005852 | -0.058553 | 0.011919 | 0.000208 | 0.336112 | 1.000000 | -0.327054 | -0.341085 | 0.008022 | -0.075234 |
| State_ID_R1012 | 0.005247 | -0.074636 | 0.020272 | -0.018253 | 0.017939 | -0.019513 | 0.021770 | -0.066453 | 0.058222 | -0.002098 | -0.106998 | -0.327054 | 1.000000 | -0.340296 | -0.005229 | -0.077093 |
| State_ID_R1013 | -0.013834 | -0.150634 | 0.002455 | 0.002766 | -0.208744 | 0.033453 | -0.027967 | 0.064563 | -0.066475 | -0.002056 | -0.094547 | -0.341085 | -0.340296 | 1.000000 | -0.011926 | -0.029639 |
| Age | -0.005457 | 0.304395 | 0.133771 | -0.008070 | 0.049260 | 0.460558 | 0.192273 | -0.381084 | -0.101073 | 0.151442 | 0.011939 | 0.008022 | -0.005229 | -0.011926 | 1.000000 | -0.007350 |
| Gender | 0.011205 | 0.034069 | 0.041261 | 0.054073 | 0.079930 | -0.027339 | 0.010277 | -0.012737 | 0.009359 | -0.003349 | 0.020968 | -0.075234 | -0.077093 | -0.029639 | -0.007350 | 1.000000 |
plt.figure(figsize=(15,10))
sns.heatmap(correlation, annot=True, linewidth=.5, cmap="crest")
plt.show()
• Perform the stratified 5-fold cross-validation technique for model building and validation • Use standardization and hyperparameter tuning effectively • Use sklearn-pipelines • Use appropriate regularization techniques to address the bias-variance trade-off
# lets first seperate the input and output data.
x = df.drop(["charges"], axis=1)
y = df[['charges']]
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=.20, random_state=10)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)
# Stochastic gradient descent optimizer
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2,0.3,0.4,0.5,
0.6,0.7,0.8,0.9,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,
9.0,10.0,20,50,100,500,1000],
'penalty': ['l2', 'l1', 'elasticnet']}
sgd = SGDRegressor()
# Cross Validation
folds = 5
model_cv = GridSearchCV(estimator = sgd,
param_grid = params,
scoring = 'neg_mean_absolute_error',
cv = folds,
return_train_score = True,
verbose = 1)
model_cv.fit(x_train,y_train)
Fitting 5 folds for each of 84 candidates, totalling 420 fits
GridSearchCV(cv=5, estimator=SGDRegressor(),
param_grid={'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3,
0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0,
4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50,
100, 500, 1000],
'penalty': ['l2', 'l1', 'elasticnet']},
return_train_score=True, scoring='neg_mean_absolute_error',
verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=SGDRegressor(),
param_grid={'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3,
0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0,
4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50,
100, 500, 1000],
'penalty': ['l2', 'l1', 'elasticnet']},
return_train_score=True, scoring='neg_mean_absolute_error',
verbose=1)SGDRegressor()
SGDRegressor()
model_cv.best_params_
{'alpha': 0.7, 'penalty': 'l1'}
sgd = SGDRegressor(alpha= 100, penalty= 'l1')
sgd.fit(x_train, y_train)
SGDRegressor(alpha=100, penalty='l1')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SGDRegressor(alpha=100, penalty='l1')
sgd.score(x_test, y_test)
0.870999188185268
y_pred = sgd.predict(x_test)
from sklearn.metrics import mean_squared_error, mean_absolute_error
sgd_mae = mean_absolute_error(y_test, y_pred)
sgd_mse = mean_squared_error(y_test, y_pred)
sgd_rmse = sgd_mse*(1/2.0)
print("MAE:", sgd_mae)
print("MSE:", sgd_mse)
print("RMSE:", sgd_rmse)
MAE: 2651.366890864599 MSE: 18775743.771674618 RMSE: 9387871.885837309
importance = sgd.coef_
pd.DataFrame(importance, index = x.columns, columns=['Inportance Score'])
| Inportance Score | |
|---|---|
| children | 344.381045 |
| Hospital tier | -1035.362293 |
| City tier | 0.000000 |
| BMI | 2666.030914 |
| HBA1C | 121.528657 |
| Heart Issues | 0.000000 |
| Any Transplants | 0.000000 |
| Cancer history | 0.000000 |
| NumberOfMajorSurgeries | 0.000000 |
| smoker | 8976.649018 |
| State_ID_R1011 | 0.000000 |
| State_ID_R1012 | 0.000000 |
| State_ID_R1013 | -263.622647 |
| Age | 3407.989315 |
| Gender | 0.000000 |
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(x_train, y_train)
RandomForestRegressor(n_estimators=1000, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestRegressor(n_estimators=1000, random_state=42)
score = rf.score(x_test,y_test)
score
0.9016674828639792
y_pred = rf.predict(x_test)
rf_mae = mean_absolute_error(y_test, y_pred)
rf_mae
1942.660224473113
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(n_estimators = 1000, random_state = 42)
gbr.fit(x_train, y_train)
GradientBoostingRegressor(n_estimators=1000, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GradientBoostingRegressor(n_estimators=1000, random_state=42)
score = gbr.score(x_test,y_test)
score
0.8654331231806346
y_pred = gbr.predict(x_test)
gbr_mae = mean_absolute_error(y_test, y_pred)
gbr_mae
2439.1036419593333
date = str(19881228)
date1 = pd.to_datetime(date, format = "%Y%m%d")
current_date = dt.datetime.now()
current_date
datetime.datetime(2024, 2, 10, 13, 41, 7, 945468)
age = (current_date - date1)
age
Timedelta('12827 days 13:41:07.945468')
age = int(12421/365)
age
34
So the age of Christopher, Ms. Jayna is 34
height_m = 170/100
height_sq = height_m*height_m
BMI = 85/height_sq
np.round(BMI,2)
29.41
BMI is 29.41
df.columns
Index(['children', 'charges', 'Hospital tier', 'City tier', 'BMI', 'HBA1C',
'Heart Issues', 'Any Transplants', 'Cancer history',
'NumberOfMajorSurgeries', 'smoker', 'State_ID_R1011', 'State_ID_R1012',
'State_ID_R1013', 'Age', 'Gender'],
dtype='object')
df.drop(['charges'],inplace=True,axis=1)
list = [[2,1,1,29.41,5.8,0,0,0,0,1,1,0,0,34,0]]
df = pd.DataFrame(list, columns = ['children','Hospital tier', 'City tier', 'BMI', 'HBA1C','Heart Issues', 'Any Transplants',
'Cancer history','NumberOfMajorSurgeries', 'smoker', 'State_ID_R1011', 'State_ID_R1012',
'State_ID_R1013', 'age', 'gender'] )
df
| children | Hospital tier | City tier | BMI | HBA1C | Heart Issues | Any Transplants | Cancer history | NumberOfMajorSurgeries | smoker | State_ID_R1011 | State_ID_R1012 | State_ID_R1013 | age | gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 1 | 1 | 29.41 | 5.8 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 34 | 0 |
Hospitalization_cost = []
# Now lets predict the hospitalization cost through SGDRegressor
Cost1 = sgd.predict(df)
Hospitalization_cost.append(Cost1)
# Now lets predict the hospitalization cost through Random Forest
Cost2 = rf.predict(df)
Hospitalization_cost.append(Cost2)
# Now lets predict the hospitalization cost throug Extreme gradient Booster
Cost3 = gbr.predict(df)
Hospitalization_cost.append(Cost3)
Hospitalization_cost
[array([217142.19936608]), array([45115.32171]), array([49915.6762532])]
avg_cost = np.mean(Hospitalization_cost)
avg_cost
104057.73244309246
The average predicted hospitalization cost is 104814.30